/*==============================================================================
step0700 - Clean up, ipolate missing values

Outline: 
I.   Create numeric ID, set tsset, perform tsfill and carryforward of initial conditions
II.  Create variables with the year of data for initial conditions
III. Interpolate population
IV.  Create variables:
	a) ue_rate
	b) hysteresis 
	c) home ownership
	d) urban shares
	e) GDP per capita
		PPP adjustment
	f) age structure
	g) average years of education
	h) employment structure
	i) inmigration and outmigration rates
	j) female employment share
	k) motorways density

==============================================================================*/ 
* Load the input panel for this step.
set more off
use "$dta_files/step300_labor_IC.dta", clear

* Rebuild the country code from the first two characters of the NUTS code.
* The two rows named "Entire U.S." and "Canada" are recoded by hand because
* their first two characters are not a country code.
drop country
generate country = substr(nuts, 1, 2)
replace country = "US" if nuts == "Entire U.S."
replace country = "CA" if nuts == "Canada"

* Restrict the sample to 14 countries. The list is split over two inlist()
* calls to keep the number of string arguments per call small.
keep if inlist(country, "AT", "BE", "CA", "CH", "ES", "DE", "DK") | ///
	inlist(country, "FI", "FR", "IT", "NL", "SE", "UK", "US")

* ------------------------------------------------------------------------------
*I. Create numeric ID, set tsset, perform tsfill and carryforward of ICs  
* ------------------------------------------------------------------------------
* One numeric panel identifier per (nuts, country) pair, moved to the front.
egen id = group(nuts country)
order id

* Declare the panel. tsset sorts the data by id and year itself, so no
* separate sort is needed beforehand.
tsset id year

* Add rows for years missing inside each panel's observed range; every
* variable other than id and year is missing on the added rows.
tsfill

* carryforward is a user-written command distributed through SSC. Install it
* only when it is not already available, so the script does not need network
* access (and does not silently switch versions) on every run.
capture which carryforward
if _rc {
	ssc install carryforward
}

* Fill the rows added by tsfill with the last observed value of the
* identifiers and initial-condition variables. The (year) sort key makes the
* within-panel time ordering explicit rather than relying on the current sort.
bysort id (year): carryforward nuts country EMP_female EMP_share ind* serv* ///
	age* gdp* inmigration outmigration edatt* hh_size home* urban* area* ///
	total* migration_yrs, replace

* ------------------------------------------------------------------------------
*II.  Create variables with the year of data for ICs
* ------------------------------------------------------------------------------

* Year of the age-structure data used as initial condition, by country.
* Countries are grouped by the year assigned to them.
generate init_yr_age = .
	local age_1968 FR
	local age_1970 CH DE PT SE US
	local age_1971 AT BE CA DK EL ES IT NL UK
	local age_1976 FI

	foreach y in 1968 1970 1971 1976 {
		foreach c of local age_`y' {
			replace init_yr_age = `y' if country == "`c'"
		}
	}

	* Keep one value per region: the minimum within nuts replaces the
	* row-level variable under the same name.
	bysort nuts: egen init_yr_age_m = min(init_yr_age)
	drop init_yr_age
	rename init_yr_age_m init_yr_age
	
* Year of the GDP data used as initial condition, by country.
* Countries are grouped by the year assigned to them.
generate init_yr_gdp = .
	local gdp_1970 BE CH DE EL FR FI IT NL PT UK US
	local gdp_1971 AT CA
	local gdp_1974 SE
	local gdp_1980 ES
	local gdp_1983 DK

	foreach y in 1970 1971 1974 1980 1983 {
		foreach c of local gdp_`y' {
			replace init_yr_gdp = `y' if country == "`c'"
		}
	}

	* Keep one value per region: the minimum within nuts replaces the
	* row-level variable under the same name.
	bysort nuts: egen init_yr_gdp_m = min(init_yr_gdp)
	drop init_yr_gdp
	rename init_yr_gdp_m init_yr_gdp
	
* Year of the migration data used as initial condition, by country.
* Countries are grouped by the year assigned to them.
* For CA and NL the value 1971 is the first year of POP data.
generate init_yr_migration = .
	local mig_1968 EL US
	local mig_1970 CH DE FI IT
	local mig_1971 CA NL UK
	local mig_1972 FR
	local mig_1973 ES SE
	local mig_1976 BE PT
	local mig_1979 AT
	local mig_1981 DK

	foreach y in 1968 1970 1971 1972 1973 1976 1979 1981 {
		foreach c of local mig_`y' {
			replace init_yr_migration = `y' if country == "`c'"
		}
	}

	* Keep one value per region: the minimum within nuts replaces the
	* row-level variable under the same name.
	bysort nuts: egen init_yr_migration_m = min(init_yr_migration)
	drop init_yr_migration
	rename init_yr_migration_m init_yr_migration
	
* Year of the female-employment data used as initial condition.
* Only the countries listed here receive a value; all others stay missing.
generate init_yr_EMP_female = .
	local fem_1968 FR
	local fem_1970 CH US
	local fem_1971 AT CA DE EL

	foreach y in 1968 1970 1971 {
		foreach c of local fem_`y' {
			replace init_yr_EMP_female = `y' if country == "`c'"
		}
	}

	* Keep one value per region: the minimum within nuts replaces the
	* row-level variable under the same name.
	bysort nuts: egen init_yr_EMP_female_m = min(init_yr_EMP_female)
	drop init_yr_EMP_female
	rename init_yr_EMP_female_m init_yr_EMP_female

********************************************************************************
*III. Interpolate population

* Working copy of population, scaled by 1000
* (presumably POP is recorded in thousands -- TODO confirm).
generate POP_m = POP * 1000

* Where that copy is missing in the base year of the age data, use age_tot.
* NOTE(review): age_tot is not spelled out elsewhere in this file; it looks
* like an abbreviation of age_total -- confirm.
replace POP_m = age_tot if POP_m == . & year == init_yr_age

* Linear interpolation over year, separately for each region.
bysort nuts (year): ipolate POP_m year, generate(POP_ipolate)
drop POP_m

*===============================================================================
* IV. Create rates/other new variables
*===============================================================================
* ------------------------------------------------------------------------------
* (a) Gen unemployment rates and hysteresis measure
* ------------------------------------------------------------------------------
* Unemployment as a percentage of employment plus unemployment.
generate ue2_rate = UNEMP / (EMP + UNEMP) * 100
label variable ue2_rate "unemployment rate: computed"

* For three UK regions, fall back on the reported ue_rate wherever the
* computed rate is missing and the reported one is not.
replace ue2_rate = ue_rate ///
	if inlist(nuts, "UKD6", "UKD7", "UKM6") & ue2_rate == . & ue_rate != .
	
* ------------------------------------------------------------------------------
* averages of unemployment & employment rates
* smooth out measurement errors and short term fluctuations
* ------------------------------------------------------------------------------

* First year of every averaging window, for 3-year and 4-year windows.
local starts_av3 1970 1971 1972 1973 1974 1979 1980 1981 1989 1990 1991
local starts_av4 1970 1971 1972 1973 1974 1975 1979 1980 1981 1988 1989 1990

* For each window width and start year:
*   xtemp_YYYY holds the rate on the rows of year YYYY only,
*   ytemp_YYYY spreads that value to every row of the region,
*   avW_<var>_<first>_<last> is the row mean of the ytemp_* columns,
*   which ignores years that are missing.
quietly foreach width in 3 4 {
	foreach var in ue2_rate {
		noisily display "`var'"
		foreach first of local starts_av`width' {
			noisily display "     year: `first'"
			local last = `first' + `width' - 1
			capture drop xtemp_* ytemp_*
			forvalues yr = `first'/`last' {
				generate xtemp_`yr' = `var' if year == `yr'
				egen ytemp_`yr' = max(xtemp_`yr'), by(nuts)
			}
			egen av`width'_`var'_`first'_`last' = rmean(ytemp_*)
		}
	}
}

* Remove the helper columns left over from the last window.
capture drop xtemp_* ytemp_*

* ------------------------------------------------------------------------------
* (b) 				Calculate hysteresis
* ------------------------------------------------------------------------------

capture drop ue2_hyst

* The measure is the sum of two log changes (times 100) in the averaged
* unemployment rate: 1980-82 to 1989-91, and 1970-73 to 1980-82.
* The arguments of the logs are held in double-precision temporaries.
tempvar step_80s step_70s
generate double `step_80s' = 1+av3_ue2_rate_1989_1991/100 - av3_ue2_rate_1980_1982/100
generate double `step_70s' = 1+av3_ue2_rate_1980_1982/100 - av4_ue2_rate_1970_1973/100
generate ue2_hyst = ln(`step_80s')*100 + ln(`step_70s')*100
drop `step_80s' `step_70s'

* Summary by country on the NUTS-2 rows of 1970.
tabstat ue2_hyst if nuts_level == 2 & year == 1970, ///
	by(country) statistics(N mean sd min max)

* ------------------------------------------------------------------------------
* (c) Home ownership
* ------------------------------------------------------------------------------

* Total number of homes; rowtotal() treats a missing component as zero.
egen homes_all = rowtotal(home_own home_no_own home_unknown_own)

* Owner-occupied homes as a percentage of all homes.
capture drop home_own_rate
generate home_own_rate = home_own / homes_all * 100
	
* ------------------------------------------------------------------------------
* (d) urban shares
* ------------------------------------------------------------------------------

* For each year and each threshold 100, 200, ..., 1000:
* urban population as a percentage of total population.
foreach year in 1990 1995 2000 {
	forvalues cut = 100(100)1000 {
		generate urban_share_`cut'_y`year' = ///
			urban_pop_`cut'_y`year' / total_pop_`year' * 100
	}
}


* Where the reported age total is zero or missing, use the sum of the age
* bands from age_0_4 through age_70_plus (in dataset variable order).
egen age_total2 = rowtotal(age_0_4-age_70_plus)
	replace age_total = age_total2 if (age_total == 0 | age_total == .)
	drop age_total2

* ------------------------------------------------------------------------------
* (e) GDP per capita 
* ------------------------------------------------------------------------------

* In the GDP base year, set GDP per capita to gdp over interpolated population.
replace gdp_per_capita = gdp / POP_ipolate if year == init_yr_gdp

* Spread each region's minimum gdp_per_capita to all of its rows; the new
* region-level variable then takes over the original name.
bysort nuts: egen gdp_per_capita_m = min(gdp_per_capita)
drop gdp_per_capita
rename gdp_per_capita_m gdp_per_capita


* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* normalize GDP using PPP from Penn World Tables
* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

* Attach the country-level Penn World Tables values; rows whose country has no
* match in the using file are dropped by keep(match).
merge m:1 country using "$insheet_files/penn_world_tables_71.dta", nogenerate keep(match)

egen temp_country = group(country)
generate gdp_per_capita3_PPP = .

* Number of country groups actually present (zero if the data are empty), so
* the loop does not depend on a hard-coded upper bound.
quietly summarize temp_country, meanonly
local n_countries = cond(missing(r(max)), 0, r(max))

* Rescale every region's GDP per capita so that the country's benchmark rows
* average to the country's pwt71_cgdp.
* Benchmark rows: nuts_level 0 rows of the country, or for DE its nuts_level 2
* rows. The parentheses around the benchmark condition matter: & is evaluated
* before |, so without them the German nuts_level 2 rows would enter the
* benchmark mean of every country.
quietly forvalues i = 1/`n_countries' {
	summarize gdp_per_capita if temp_country==`i' & ///
		(nuts_level==0 | (country=="DE" & nuts_level==2))
	local mean0 = r(mean)
	summarize pwt71_cgdp if temp_country==`i'
	local mean1 = r(mean)
	* A country with no benchmark rows gets a missing mean0 and therefore a
	* missing PPP value.
	replace gdp_per_capita3_PPP = gdp_per_capita*`mean1'/`mean0' if temp_country==`i'
}
drop temp_country

generate logY_ppp3 = log(gdp_per_capita3_PPP)*100
label var logY_ppp3 "log GDP per capita, PPP (PWT)"

generate logY_ppp_base = logY_ppp3

* ------------------------------------------------------------------------------
* (f) Age structure 
* ------------------------------------------------------------------------------
* All shares are percentages of age_total.

* Younger cohorts: 0-14, 15-24, and their sum.
generate share_age_0_14  = (age_0_4 + age_5_9 + age_10_14) / age_total * 100
generate share_age_15_24 = (age_15_19 + age_20_24) / age_total * 100
generate share_age_0_24  = share_age_0_14 + share_age_15_24

* Older cohorts: 55-64, 65 and over, and their sum.
generate share_age_55_64  = (age_55_59 + age_60_64) / age_total * 100
generate share_age_65plus = (age_65_69 + age_70_plus) / age_total * 100
generate share_age_55plus = share_age_55_64 + share_age_65plus

* ------------------------------------------------------------------------------
* (g) Average Years of Education 
* ------------------------------------------------------------------------------

* Where the reported total is missing, use the sum over all edatt_yrs* columns.
egen edatt_total2 = rowtotal(edatt_yrs*)
	replace edatt_total = edatt_total2 if edatt_total == .
	drop edatt_total2

* Treat a missing count in any years-of-schooling category as zero.
foreach v of varlist edatt_yrs* {
	replace `v' = 0 if `v' == .
}

* Average years of schooling: each category k = 0..21 contributes its share of
* edatt_total times k. The expression is assembled term by term and evaluated
* in a single generate.
local weighted edatt_yrs_0/edatt_total*0
forvalues k = 1/21 {
	local weighted `weighted' + edatt_yrs_`k'/edatt_total*`k'
}
generate year_school = `weighted'

********************************************************************************
* Educ data are for 1981. Extrapolate linear trend from 1970-1980 backwards
* According to de la Fuente and Domenech (2012) Table 8, pg. 10, for DK 
* average education increased from 10.65 years in 1970 to 11.01 years by 1980.
* For UK, average education increased from 7.58 years in 1970 to 8.48 years by 
* 1980.
********************************************************************************

* Scale by the ratio of the 1970 to the 1980 national average quoted above.
replace year_school = 10.65 / 11.01 * year_school if country == "DK"
replace year_school = 7.58 / 8.48 * year_school if country == "UK"

* ------------------------------------------------------------------------------
* (h) Employment structure
* ------------------------------------------------------------------------------

* Denominator for all shares below: services + industry + agriculture.
generate total_employment = serv_total + ind_total + ind_agro

* One percentage share per sector variable, named share_<sector>.
local sectors ind_agro ind_energy ind_mining ind_construction ///
	ind_total ind_metals ind_other ind_mnfg ///
	serv_commerce serv_transport serv_credit ///
	serv_admin serv_total serv_other
foreach s of local sectors {
	generate share_`s' = `s' / total_employment * 100
}

* Three-sector split: energy and mining are counted in the primary sector and
* removed from the industry total for the secondary sector.
generate share_emp_primary_sector = (ind_agro+ind_energy+ind_mining)/total_employment*100
label variable share_emp_primary_sector "Share of employment in the primary sector"

generate share_emp_secondary_sector = (ind_total-ind_energy-ind_mining)/total_employment*100
label variable share_emp_secondary_sector "Share of employment in the secondary sector"

generate share_emp_tertiary_sector = serv_total/total_employment*100
label variable share_emp_tertiary_sector "Share of employment in the tertiary sector"

* ------------------------------------------------------------------------------
* (i) Inmigration and Outmigration Rates
* ------------------------------------------------------------------------------

* The same steps are applied to inflows and outflows:
*   1. outside DE, in the migration base year: flow divided by
*      migration_yrs^0.7, relative to interpolated population;
*   2. that single value is spread to all rows of the region;
*   3. DE uses migration_pop_base_de as denominator instead. Germany has a
*      special population base due to the ability to construct modern regions
*      from district-level migration data;
*   4. the rate is expressed in percent.
foreach flow in inmigration outmigration {
	generate `flow'_rate = (`flow'/(migration_yrs)^(.7)) / POP_ipolate ///
		if year==init_yr_migration & country!="DE"
	bysort nuts: egen `flow'_rate_m = min(`flow'_rate)
	drop `flow'_rate
	rename `flow'_rate_m `flow'_rate
	replace `flow'_rate = `flow'/migration_pop_base_de if country=="DE"
	replace `flow'_rate = `flow'_rate*100
}

* Average of the two rates, and their difference (in minus out).
generate turnmig_rate = (inmigration_rate+outmigration_rate)/2
generate netmig_rate = inmigration_rate-outmigration_rate

* ------------------------------------------------------------------------------
* (j) Female Employment Share 
* ------------------------------------------------------------------------------

* Mark both female-employment variables as initial conditions (ic_ prefix).
rename EMP_share_female ic_EMP_share_female
rename EMP_female ic_EMP_female

* Share computed from levels in the base year: ic_EMP_female is divided by
* 1000 before being related to EMP, and the result is expressed in percent.
generate ic_EMP_share_female_2 = (ic_EMP_female/1000) / EMP*100 if year==init_yr_EMP_female

* Spread the base-year value to all rows of the region and use it only where
* the reported share is missing.
bysort nuts: egen ic_EMP_share_female_m = min(ic_EMP_share_female_2)
replace ic_EMP_share_female = ic_EMP_share_female_m if ic_EMP_share_female == .
drop ic_EMP_share_female_2 ic_EMP_share_female_m

* ------------------------------------------------------------------------------
* (k) Motorways density
* ------------------------------------------------------------------------------

generate log_road_density = log(motorways/area)

* Regional average of log road density over 1975-1980, stored as m<var>.
* The regional mean is taken over xtemp, the copy restricted to 1975-1980.
* Averaging the unrestricted variable would use every year and leave the
* year restriction without any effect.
foreach var in log_road_density {
	capture drop xtemp
	capture drop ytemp
	generate xtemp = `var' if year>=1975 & year<=1980
	egen ytemp = mean(xtemp), by(nuts)
	capture drop m`var'
	generate m`var' = ytemp
}
capture drop xtemp
capture drop ytemp

* Impute road density for some UK regions
** UKF3: mean over the rows of UKF3, UKF1, UKF2, UKE1 and UKH1
capture drop temp_xxx
egen temp_xxx = mean(mlog_road_density) if ///
	nuts=="UKF3" | nuts=="UKF1" | nuts=="UKF2" | nuts=="UKE1" | nuts=="UKH1"
replace mlog_road_density = temp_xxx if nuts=="UKF3" & mlog_road_density==.
capture drop temp_xxx

** UKK3: mean over the rows of UKK3 and UKK4
capture drop temp_xxx
egen temp_xxx = mean(mlog_road_density) if ///
	nuts=="UKK3" | nuts=="UKK4"
replace mlog_road_density = temp_xxx if nuts=="UKK3" & mlog_road_density==.
capture drop temp_xxx

********************************************************************************
*Some clean up, fill in missing values 

* NOTE(review): the replacements below run after the shares and PPP variables
* were computed, so those derived variables do not reflect them -- confirm
* that this ordering is intended.
replace ind_energy = 0 if ind_energy == .

* Replace GDP for FR82 with GDP from the composite region FR82&FR83:
* the 1968 mean for gdp, the mean over all rows for gdp_per_capita.
summarize gdp if nuts == "FR82&FR83" & year == 1968
replace gdp = r(mean) if nuts == "FR82"

summarize gdp_per_capita if nuts == "FR82&FR83"
replace gdp_per_capita = r(mean) if nuts == "FR82"


********************************************************************************

* Put identifiers and the main variable groups first, sort by region and year,
* and save in the older dataset format.
order id country nuts year POP* LF* UNEMP* EMP* ic_EMP* ue_rate* ///
	ind* serv* age* gdp* edatt* inmigration* outmigration* migration_yrs ///
	motorways urban* _merge*

sort nuts year

saveold "$dta_files/step301_cleanup_ipolate.dta", replace



